import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import make_scorer, f1_score, roc_auc_score,accuracy_score,precision_score,recall_score
# Scorer object for model selection (macro averaging is set per-search later).
f1_scorer = make_scorer(f1_score)

# Load the 2019 World Happiness data and split features from the target.
data = pd.read_csv("worldhappiness2019.csv")
# Drop label and identifier columns; keep the six indicators plus region.
X = data.drop(['Happiness_level', 'name', 'Country or region', 'sub-region'], axis=1)
y = data['Happiness_level']
X.head()
| GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | region | |
|---|---|---|---|---|---|---|---|
| 0 | 1.340 | 1.587 | 0.986 | 0.596 | 0.153 | 0.393 | Europe |
| 1 | 1.383 | 1.573 | 0.996 | 0.592 | 0.252 | 0.410 | Europe |
| 2 | 1.488 | 1.582 | 1.028 | 0.603 | 0.271 | 0.341 | Europe |
| 3 | 1.380 | 1.624 | 1.026 | 0.591 | 0.354 | 0.118 | Europe |
| 4 | 1.396 | 1.522 | 0.999 | 0.557 | 0.322 | 0.298 | Europe |
# convert outcome categories into numeric: 5 = Very High .. 1 = Very Low
label_to_num = {'Very High': 5, 'High': 4, 'Average': 3, 'Low': 2, 'Very Low': 1}
conditions = [(y == label) for label in label_to_num]
choices = list(label_to_num.values())
y_num = np.select(conditions, choices)
Summary statistics of average feature values by happiness categories
# Mean of every numeric feature within each happiness category,
# ordered from happiest to least happy.
hap_order = ["Very High", "High", "Average", "Low", "Very Low"]
data.groupby(['Happiness_level']).mean().reindex(hap_order)
| GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | |
|---|---|---|---|---|---|---|
| Happiness_level | ||||||
| Very High | 1.337742 | 1.472355 | 0.965387 | 0.509323 | 0.227516 | 0.208065 |
| High | 1.102000 | 1.396687 | 0.860344 | 0.444719 | 0.156250 | 0.082625 |
| Average | 0.979258 | 1.266710 | 0.788290 | 0.360645 | 0.159516 | 0.075839 |
| Low | 0.644935 | 1.022097 | 0.569645 | 0.360806 | 0.172355 | 0.093677 |
| Very Low | 0.455452 | 0.880161 | 0.438194 | 0.285677 | 0.209516 | 0.093710 |
# Grouped bar chart of the per-category feature means computed above.
means_by_level = data.groupby(['Happiness_level']).mean().loc[hap_order]
ax = means_by_level.plot(kind="bar", stacked=False, legend=False)
ax.set_ylabel("Value")
# Place the legend outside the axes so it does not cover the bars.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title("Mean Features by Happiness level")
Text(0.5, 1.0, 'Mean Features by Happiness level')
The plot above shows a strong correlation between GDP, social support, healthy life expectancy and happiness level. As these category values increase, happiness level also increases. This trend also exists for Freedom to make life choices, but the relationship is not as strong. Generosity and perception of corruption are relatively similar for the High, Average, and Low happiness levels, with slight increases in these categories for Very High and Very Low happiness; the correlation between these two variables and happiness level is not as strong as for the other variables.
import seaborn as sns

# Distribution of Generosity within each happiness category.
sns.boxplot(data=data, x='Happiness_level', y='Generosity')
plt.title("Generosity by Happiness level")
Text(0.5, 1.0, 'Generosity by Happiness level')
This boxplot supports the above trend: there is very little trend between happiness level and generosity.
# Distribution of Social support within each happiness category.
sns.boxplot(data=data, x='Happiness_level', y='Social support')
plt.title("Social support")
Text(0.5, 1.0, 'Social support')
This boxplot also supports the trend in the initial plot: the highest mean for social support is among Very High happiness level while the lowest mean is among the Very Low happiness level with a decreasing trend between those happiness categories.
# balance of outcome variable: observations per happiness level
# (counting any fully-populated column works; 'region' is used here)
level_counts = data.groupby(['Happiness_level']).count()[['region']]
level_counts.plot.bar(stacked=True)
plt.title("Outcome Variable Count")
Text(0.5, 1.0, 'Outcome Variable Count')
This barplot shows the dispersion, or number of observations, by happiness level. The categories are relatively evenly distributed.
## correlation matrix
# Add a numeric version of the outcome so it appears in the correlation matrix.
happiness_map = {'Very High': 5, 'High': 4, 'Average': 3, 'Low': 2, 'Very Low': 1}
conditions = [(data.Happiness_level == level) for level in happiness_map]
choices = list(happiness_map.values())
data['Happiness_level_num'] = np.select(conditions, choices)

# Lower-triangle heatmap: mask the redundant upper triangle.
corr_mat = data.corr()
mask = np.triu(np.ones_like(corr_mat, dtype=bool))
sns.heatmap(corr_mat, mask=mask, annot=True)
plt.title("Correlation Matrix")
Text(0.5, 1.0, 'Correlation Matrix')
Lastly, I include a correlation matrix. The lighter shade represents a higher correlation between two variables while a darker shade indicates a lower correlation. The most important row to analyze is the final one, showing the correlation between happiness level and the covariates. Happiness level is most correlated with GDP per capita and life expectancy, both at 0.79, followed by social support at 0.74. As identified earlier, generosity has the lowest correlation at 0.028.
# Set up training and test data
from sklearn.model_selection import train_test_split
# Hold out a third of the rows; fixed seed so results are reproducible.
# Targets are the numeric happiness levels (1-5) built earlier.
X_train, X_test, y_train, y_test = train_test_split(X, y_num, test_size=.33, random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_train.columns.tolist())
(104, 7) (104,) ['GDP per capita', 'Social support', 'Healthy life expectancy', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption', 'region']
# # categorical (labeled) outcome variable
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=42)
# print(X_train.shape)
# print(y_train.shape)
# print(X_train.columns.tolist())
# preprocessor
# Numeric columns: median-impute then standardize.
numeric_features = [c for c in X.columns if c != 'region']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical column: mode-impute then one-hot encode; unseen regions at
# transform time are ignored rather than raising.
categorical_features = ['region']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# final preprocessor object set up with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Fit on the training split only, then shadow the name with a convenience
# function that applies the fitted transform to any data frame.
preprocess = preprocessor.fit(X_train)

def preprocessor(data):
    """Apply the fitted column transformer and return the feature matrix."""
    return preprocess.transform(data)

# Check shape for keras input:
preprocessor(X_train).shape  # pretty small dataset
(104, 11)
The outcome variable, happiness level, is a 5-class outcome variable. To examine feature importance, I run a random forest classifier to get an idea of which features hold the highest Gini importance. Similar to the trends discovered in the bivariate analyses above, GDP per capita, Social support, and healthy life expectancy hold the highest Gini importance values.
from sklearn.ensemble import RandomForestClassifier

# Rank features by Gini importance using a random forest fit on the
# preprocessed (scaled + one-hot-encoded) training matrix.
model = RandomForestClassifier()
# fit the model
model.fit(preprocessor(X_train), y_train)
# get importance
importance = model.feature_importances_

# BUG FIX: the transformed matrix has 11 columns (6 scaled numeric features
# followed by 5 one-hot region indicators), but the original code zipped the
# importances against the 7 raw X_train column names — zip silently truncated
# the list and mislabeled the 'region' bar. Build the true output-column
# names from the fitted transformer instead.
onehot = preprocess.named_transformers_['cat'].named_steps['onehot']
region_names = [f"region_{c}" for c in onehot.categories_[0]]
feature_names = numeric_features + region_names

feats = dict(zip(feature_names, model.feature_importances_))
plt.bar(*zip(*feats.items()))
plt.xticks(rotation=90)
plt.title("Gini-importance")
plt.show()
Another way to understand feature importance is with Principal Component Analysis.
This initial plot below shows that nearly 80% of the variance in the data is explained by the first two principal components, and nearly 90% with the first four.
# BUG FIX: PCA is first used here but was only imported in a later cell,
# so a top-to-bottom run raised NameError. Import it before use.
from sklearn.decomposition import PCA

# Cumulative explained variance: how many components capture the data.
pca = PCA().fit(preprocessor(X_train))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');
This plot below shows a 2D PCA scatter plot of the first two components. The point color corresponds to the outcome variable, happiness level, with 5 (yellow) being Very High and 1 (purple) being Very Low
# code for this plot: https://plotly.com/python/pca-visualization/
import plotly.express as px
from sklearn.decomposition import PCA

# Project the six numeric training features onto the first two components.
numeric_cols = ['GDP per capita', 'Social support', 'Healthy life expectancy',
                'Freedom to make life choices', 'Generosity',
                'Perceptions of corruption']
df = X_train[numeric_cols]
pca = PCA(n_components=2)
components = pca.fit_transform(df)

# Color points by numeric happiness level (5 = Very High, 1 = Very Low).
fig = px.scatter(components, x=0, y=1, color=y_train)
fig.show()
The plot below is a biplot, showing the importance of each feature by its eigenvector. A higher-magnitude eigenvector suggests a feature with greater importance.
GDP per capita and Social support have the largest magnitudes, suggesting they are the more important features.
# code for this plot: https://plotly.com/python/pca-visualization/
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

features = ['GDP per capita', 'Social support', 'Healthy life expectancy',
            'Freedom to make life choices', 'Generosity',
            'Perceptions of corruption']
df = X_train[features]

# Two-component PCA plus loadings (eigenvectors scaled by sqrt(eigenvalue))
# so that arrow length reflects each feature's contribution.
pca = PCA(n_components=2)
components = pca.fit_transform(df)
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

fig = px.scatter(components, x=0, y=1, color=y_train)
for i, feature in enumerate(features):
    x_load, y_load = loadings[i, 0], loadings[i, 1]
    # Draw the loading vector from the origin ...
    fig.add_shape(type='line', x0=0, y0=0, x1=x_load, y1=y_load)
    # ... and label its tip with the feature name.
    fig.add_annotation(x=x_load, y=y_load, ax=0, ay=0,
                       xanchor="center", yanchor="bottom", text=feature)
fig.show()
GDP per capita and Social support are the two most important variables in Principal Component 1.
# Absolute PC1 loadings; the first two entries line up with the first two
# feature columns (GDP per capita, Social support).
most_import = abs(pca.components_)
for col_name, weight in zip(X_train.columns[0:2], most_import[0][:2]):
    print(col_name, weight)
GDP per capita 0.7538426999830324 Social support 0.4995023449423256
Keras
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
import keras
from keras.optimizers import SGD
import tensorflow as tf

# Builds model
def compile_model(n=8, learn_rate=0.01, input_dim=11):
    """Build and compile a two-hidden-layer softmax classifier.

    Args:
        n: width of each hidden layer.
        learn_rate: SGD learning rate.
        input_dim: number of input features. Defaults to 11 (6 scaled
            numeric features + 5 one-hot regions after preprocessing);
            parameterized instead of hard-coded so the builder also works
            with other feature sets.

    Returns:
        A compiled Sequential model with 5 softmax outputs, one per
        happiness level.
    """
    # Per-class AUC averaged across the 5 output classes.
    auc = tf.keras.metrics.AUC(multi_label=True)
    # Create the Model
    model = Sequential()
    model.add(Dense(n, input_dim=input_dim, activation='relu'))
    model.add(Dense(n, activation='relu'))
    model.add(Dense(5, activation='softmax'))
    # Compile the Model. `learning_rate` replaces the deprecated `lr`
    # keyword, which newer keras releases have removed.
    optimizer = SGD(learning_rate=learn_rate)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=[auc])
    return model
from numpy.random import seed
seed(9)

# Baseline network with default hyperparameters.
basemodel = compile_model()
auc = tf.keras.metrics.AUC(multi_label=True)

# Hyperparameter grid: hidden width, learning rate, epochs, batch size.
n = [80, 96, 112, 128]
learn_rate = [0.0001, 0.001]
epochs = [10, 20, 30]
batches = [2, 5, 10]
param_grid = dict(n=n, learn_rate=learn_rate, epochs=epochs, batch_size=batches)

model = KerasClassifier(build_fn=compile_model, verbose=0)
# Use n_jobs=-1 to Parallelize Across Available Processors (to speed it up)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, scoring='f1_macro')
grid = grid.fit(preprocessor(X_train), y_train)

print(f"Best score is {grid.best_score_:.2f}, Best parameter is {grid.best_params_}")

# Full cross-validation table: mean (std) for every parameter combination.
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:.2f} ({stdev:.2f}) with {param}")
Best score is 0.45, Best parameter is {'batch_size': 2, 'epochs': 30, 'learn_rate': 0.001, 'n': 112}
0.13 (0.08) with {'batch_size': 2, 'epochs': 10, 'learn_rate': 0.0001, 'n': 80}
0.18 (0.07) with {'batch_size': 2, 'epochs': 10, 'learn_rate': 0.0001, 'n': 96}
0.19 (0.14) with {'batch_size': 2, 'epochs': 10, 'learn_rate': 0.0001, 'n': 112}
0.17 (0.06) with {'batch_size': 2, 'epochs': 10, 'learn_rate': 0.0001, 'n': 128}
0.35 (0.08) with {'batch_size': 2, 'epochs': 10, 'learn_rate': 0.001, 'n': 80}
0.33 (0.10) with {'batch_size': 2, 'epochs': 10, 'learn_rate': 0.001, 'n': 96}
0.36 (0.09) with {'batch_size': 2, 'epochs': 10, 'learn_rate': 0.001, 'n': 112}
0.33 (0.06) with {'batch_size': 2, 'epochs': 10, 'learn_rate': 0.001, 'n': 128}
0.14 (0.09) with {'batch_size': 2, 'epochs': 20, 'learn_rate': 0.0001, 'n': 80}
0.20 (0.07) with {'batch_size': 2, 'epochs': 20, 'learn_rate': 0.0001, 'n': 96}
0.24 (0.12) with {'batch_size': 2, 'epochs': 20, 'learn_rate': 0.0001, 'n': 112}
0.20 (0.05) with {'batch_size': 2, 'epochs': 20, 'learn_rate': 0.0001, 'n': 128}
0.38 (0.08) with {'batch_size': 2, 'epochs': 20, 'learn_rate': 0.001, 'n': 80}
0.36 (0.09) with {'batch_size': 2, 'epochs': 20, 'learn_rate': 0.001, 'n': 96}
0.39 (0.06) with {'batch_size': 2, 'epochs': 20, 'learn_rate': 0.001, 'n': 112}
0.35 (0.09) with {'batch_size': 2, 'epochs': 20, 'learn_rate': 0.001, 'n': 128}
0.13 (0.07) with {'batch_size': 2, 'epochs': 30, 'learn_rate': 0.0001, 'n': 80}
0.22 (0.13) with {'batch_size': 2, 'epochs': 30, 'learn_rate': 0.0001, 'n': 96}
0.19 (0.15) with {'batch_size': 2, 'epochs': 30, 'learn_rate': 0.0001, 'n': 112}
0.19 (0.09) with {'batch_size': 2, 'epochs': 30, 'learn_rate': 0.0001, 'n': 128}
0.45 (0.05) with {'batch_size': 2, 'epochs': 30, 'learn_rate': 0.001, 'n': 80}
0.43 (0.09) with {'batch_size': 2, 'epochs': 30, 'learn_rate': 0.001, 'n': 96}
0.45 (0.09) with {'batch_size': 2, 'epochs': 30, 'learn_rate': 0.001, 'n': 112}
0.44 (0.03) with {'batch_size': 2, 'epochs': 30, 'learn_rate': 0.001, 'n': 128}
0.29 (0.11) with {'batch_size': 5, 'epochs': 10, 'learn_rate': 0.0001, 'n': 80}
0.13 (0.05) with {'batch_size': 5, 'epochs': 10, 'learn_rate': 0.0001, 'n': 96}
0.14 (0.11) with {'batch_size': 5, 'epochs': 10, 'learn_rate': 0.0001, 'n': 112}
0.25 (0.12) with {'batch_size': 5, 'epochs': 10, 'learn_rate': 0.0001, 'n': 128}
0.25 (0.12) with {'batch_size': 5, 'epochs': 10, 'learn_rate': 0.001, 'n': 80}
0.20 (0.09) with {'batch_size': 5, 'epochs': 10, 'learn_rate': 0.001, 'n': 96}
0.29 (0.06) with {'batch_size': 5, 'epochs': 10, 'learn_rate': 0.001, 'n': 112}
0.21 (0.06) with {'batch_size': 5, 'epochs': 10, 'learn_rate': 0.001, 'n': 128}
0.17 (0.09) with {'batch_size': 5, 'epochs': 20, 'learn_rate': 0.0001, 'n': 80}
0.14 (0.10) with {'batch_size': 5, 'epochs': 20, 'learn_rate': 0.0001, 'n': 96}
0.14 (0.08) with {'batch_size': 5, 'epochs': 20, 'learn_rate': 0.0001, 'n': 112}
0.22 (0.08) with {'batch_size': 5, 'epochs': 20, 'learn_rate': 0.0001, 'n': 128}
0.25 (0.09) with {'batch_size': 5, 'epochs': 20, 'learn_rate': 0.001, 'n': 80}
0.31 (0.07) with {'batch_size': 5, 'epochs': 20, 'learn_rate': 0.001, 'n': 96}
0.26 (0.10) with {'batch_size': 5, 'epochs': 20, 'learn_rate': 0.001, 'n': 112}
0.31 (0.10) with {'batch_size': 5, 'epochs': 20, 'learn_rate': 0.001, 'n': 128}
0.18 (0.10) with {'batch_size': 5, 'epochs': 30, 'learn_rate': 0.0001, 'n': 80}
0.11 (0.07) with {'batch_size': 5, 'epochs': 30, 'learn_rate': 0.0001, 'n': 96}
0.16 (0.07) with {'batch_size': 5, 'epochs': 30, 'learn_rate': 0.0001, 'n': 112}
0.19 (0.14) with {'batch_size': 5, 'epochs': 30, 'learn_rate': 0.0001, 'n': 128}
0.36 (0.14) with {'batch_size': 5, 'epochs': 30, 'learn_rate': 0.001, 'n': 80}
0.39 (0.05) with {'batch_size': 5, 'epochs': 30, 'learn_rate': 0.001, 'n': 96}
0.40 (0.03) with {'batch_size': 5, 'epochs': 30, 'learn_rate': 0.001, 'n': 112}
0.41 (0.07) with {'batch_size': 5, 'epochs': 30, 'learn_rate': 0.001, 'n': 128}
0.11 (0.11) with {'batch_size': 10, 'epochs': 10, 'learn_rate': 0.0001, 'n': 80}
0.16 (0.10) with {'batch_size': 10, 'epochs': 10, 'learn_rate': 0.0001, 'n': 96}
0.08 (0.03) with {'batch_size': 10, 'epochs': 10, 'learn_rate': 0.0001, 'n': 112}
0.17 (0.09) with {'batch_size': 10, 'epochs': 10, 'learn_rate': 0.0001, 'n': 128}
0.17 (0.10) with {'batch_size': 10, 'epochs': 10, 'learn_rate': 0.001, 'n': 80}
0.19 (0.02) with {'batch_size': 10, 'epochs': 10, 'learn_rate': 0.001, 'n': 96}
0.13 (0.06) with {'batch_size': 10, 'epochs': 10, 'learn_rate': 0.001, 'n': 112}
0.25 (0.09) with {'batch_size': 10, 'epochs': 10, 'learn_rate': 0.001, 'n': 128}
0.13 (0.05) with {'batch_size': 10, 'epochs': 20, 'learn_rate': 0.0001, 'n': 80}
0.18 (0.05) with {'batch_size': 10, 'epochs': 20, 'learn_rate': 0.0001, 'n': 96}
0.12 (0.11) with {'batch_size': 10, 'epochs': 20, 'learn_rate': 0.0001, 'n': 112}
0.16 (0.06) with {'batch_size': 10, 'epochs': 20, 'learn_rate': 0.0001, 'n': 128}
0.32 (0.10) with {'batch_size': 10, 'epochs': 20, 'learn_rate': 0.001, 'n': 80}
0.33 (0.06) with {'batch_size': 10, 'epochs': 20, 'learn_rate': 0.001, 'n': 96}
0.26 (0.10) with {'batch_size': 10, 'epochs': 20, 'learn_rate': 0.001, 'n': 112}
0.30 (0.09) with {'batch_size': 10, 'epochs': 20, 'learn_rate': 0.001, 'n': 128}
0.14 (0.06) with {'batch_size': 10, 'epochs': 30, 'learn_rate': 0.0001, 'n': 80}
0.15 (0.11) with {'batch_size': 10, 'epochs': 30, 'learn_rate': 0.0001, 'n': 96}
0.13 (0.07) with {'batch_size': 10, 'epochs': 30, 'learn_rate': 0.0001, 'n': 112}
0.18 (0.03) with {'batch_size': 10, 'epochs': 30, 'learn_rate': 0.0001, 'n': 128}
0.27 (0.08) with {'batch_size': 10, 'epochs': 30, 'learn_rate': 0.001, 'n': 80}
0.32 (0.07) with {'batch_size': 10, 'epochs': 30, 'learn_rate': 0.001, 'n': 96}
0.27 (0.09) with {'batch_size': 10, 'epochs': 30, 'learn_rate': 0.001, 'n': 112}
0.26 (0.08) with {'batch_size': 10, 'epochs': 30, 'learn_rate': 0.001, 'n': 128}
# Best mean cross-validated f1_macro from the keras grid search.
grid.best_score_
0.4523931623931624
# Hyperparameter combination that achieved the best score above.
grid.best_params_
{'batch_size': 2, 'epochs': 30, 'learn_rate': 0.001, 'n': 112}
# refit model with best parameters
# Rebuild the classifier with the winning hyperparameters and train it on
# the full training split.
model = KerasClassifier(build_fn=compile_model, **grid.best_params_)
model.fit(preprocessor(X_train), y_train)
Epoch 1/30 52/52 [==============================] - 0s 598us/step - loss: 1.6720 - auc_26: 0.3604 Epoch 2/30 52/52 [==============================] - 0s 531us/step - loss: 1.6423 - auc_26: 0.4533 Epoch 3/30 52/52 [==============================] - 0s 513us/step - loss: 1.6329 - auc_26: 0.4918 Epoch 4/30 52/52 [==============================] - 0s 519us/step - loss: 1.6142 - auc_26: 0.5360 Epoch 5/30 52/52 [==============================] - 0s 507us/step - loss: 1.5800 - auc_26: 0.5487 Epoch 6/30 52/52 [==============================] - 0s 531us/step - loss: 1.5586 - auc_26: 0.6577 Epoch 7/30 52/52 [==============================] - 0s 518us/step - loss: 1.5096 - auc_26: 0.6797 Epoch 8/30 52/52 [==============================] - 0s 596us/step - loss: 1.5674 - auc_26: 0.6395 Epoch 9/30 52/52 [==============================] - 0s 523us/step - loss: 1.4782 - auc_26: 0.7305 Epoch 10/30 52/52 [==============================] - 0s 536us/step - loss: 1.4422 - auc_26: 0.7836 Epoch 11/30 52/52 [==============================] - 0s 532us/step - loss: 1.5036 - auc_26: 0.7296 Epoch 12/30 52/52 [==============================] - 0s 546us/step - loss: 1.3525 - auc_26: 0.8176 Epoch 13/30 52/52 [==============================] - 0s 518us/step - loss: 1.3897 - auc_26: 0.8028 Epoch 14/30 52/52 [==============================] - 0s 510us/step - loss: 1.4241 - auc_26: 0.7945 Epoch 15/30 52/52 [==============================] - 0s 525us/step - loss: 1.4128 - auc_26: 0.7823 Epoch 16/30 52/52 [==============================] - 0s 528us/step - loss: 1.3583 - auc_26: 0.8076 Epoch 17/30 52/52 [==============================] - 0s 533us/step - loss: 1.3825 - auc_26: 0.8352 Epoch 18/30 52/52 [==============================] - 0s 506us/step - loss: 1.3967 - auc_26: 0.7952 Epoch 19/30 52/52 [==============================] - 0s 539us/step - loss: 1.3512 - auc_26: 0.8288 Epoch 20/30 52/52 [==============================] - 0s 512us/step - loss: 1.2734 - auc_26: 0.8368 Epoch 21/30 52/52 
[==============================] - 0s 527us/step - loss: 1.3262 - auc_26: 0.8303 Epoch 22/30 52/52 [==============================] - 0s 506us/step - loss: 1.3052 - auc_26: 0.8196 Epoch 23/30 52/52 [==============================] - 0s 530us/step - loss: 1.2827 - auc_26: 0.8316 Epoch 24/30 52/52 [==============================] - 0s 505us/step - loss: 1.2952 - auc_26: 0.8251 Epoch 25/30 52/52 [==============================] - 0s 537us/step - loss: 1.2381 - auc_26: 0.8436 Epoch 26/30 52/52 [==============================] - 0s 505us/step - loss: 1.2394 - auc_26: 0.8575 Epoch 27/30 52/52 [==============================] - 0s 529us/step - loss: 1.2390 - auc_26: 0.8482 Epoch 28/30 52/52 [==============================] - 0s 505us/step - loss: 1.2248 - auc_26: 0.8625 Epoch 29/30 52/52 [==============================] - 0s 556us/step - loss: 1.2404 - auc_26: 0.8574 Epoch 30/30 52/52 [==============================] - 0s 514us/step - loss: 1.1616 - auc_26: 0.8452
<tensorflow.python.keras.callbacks.History at 0x7fc1a232ccd0>
# Evaluate the tuned network on the held-out test set.
y_predictions = model.predict(preprocessor(X_test))
print(f"test-set accuracy: {accuracy_score(y_test, y_predictions):.3f}")
print(f"test-set f1-score: {f1_score(y_test, y_predictions, average='macro'):.3f}")
print(f"test-set precision score: {precision_score(y_test, y_predictions, average='macro'):.3f}")
print(f"test-set recall score: {recall_score(y_test, y_predictions, average='macro'):.3f}")
test-set accuracy: 0.442 test-set f1-score: 0.374 test-set precision score: 0.508 test-set recall score: 0.468
/Users/kristenakey/opt/anaconda3/envs/maps/lib/python3.8/site-packages/tensorflow/python/keras/engine/sequential.py:450: UserWarning:
`model.predict_classes()` is deprecated and will be removed after 2021-01-01. Please use instead:* `np.argmax(model.predict(x), axis=-1)`, if your model does multi-class classification (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`, if your model does binary classification (e.g. if it uses a `sigmoid` last-layer activation).
Random Forest
from sklearn.ensemble import RandomForestClassifier

# Random-forest grid: ensemble size, tree depth, split criterion.
param_grid = {'n_estimators': [300, 500, 800, 1100],
              "max_depth": [None, 5, 10, 15, 20, 30],
              "criterion": ['gini', 'entropy']}
# cv = number of folds
gridrf = GridSearchCV(RandomForestClassifier(random_state=9),
                      param_grid=param_grid, cv=5, scoring='f1_macro')
gridrf.fit(preprocessor(X_train), y_train)

print(f"best parameters: {gridrf.best_params_}")
print(f"best mean cross-validation score: {gridrf.best_score_:.3f}")

# Held-out test-set performance of the refit best forest.
y_predictions = gridrf.predict(preprocessor(X_test))
print(f"test-set accuracy: {accuracy_score(y_test, y_predictions):.3f}")
print(f"test-set f1-score: {f1_score(y_test, y_predictions, average='macro'):.3f}")
print(f"test-set precision score: {precision_score(y_test, y_predictions, average='macro'):.3f}")
print(f"test-set recall score: {recall_score(y_test, y_predictions, average='macro'):.3f}")
best parameters: {'criterion': 'gini', 'max_depth': 5, 'n_estimators': 300}
best mean cross-validation score: 0.588
test-set accuracy: 0.423
test-set f1-score: 0.427
test-set precision score: 0.470
test-set recall score: 0.435
Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosting grid: ensemble size, depth, shrinkage.
param_grid = {'n_estimators': [300, 400, 500],
              "max_depth": [3, 4, 5],
              "learning_rate": [.001, .01, .1]}
# cv = number of folds
gridgb = GridSearchCV(GradientBoostingClassifier(random_state=9),
                      param_grid=param_grid, cv=5, scoring='f1_macro')
gridgb.fit(preprocessor(X_train), y_train)
print("best parameters: {}".format(gridgb.best_params_))
print("best mean cross-validation score: {:.3f}".format(gridgb.best_score_))
# BUG FIX: the original called `model.predict(...)` here, which was still the
# refit keras network — so the printed "gradient boosting" test metrics were
# actually the neural network's (the keras predict_classes deprecation warning
# in the original output confirms it). Predict with the GB grid instead.
y_predictions = gridgb.predict(preprocessor(X_test))
print("test-set accuracy: {:.3f}".format(accuracy_score(y_test, y_predictions)))
print("test-set f1-score: {:.3f}".format(f1_score(y_test, y_predictions, average='macro')))
print("test-set precision score: {:.3f}".format(precision_score(y_test, y_predictions, average='macro')))
print("test-set recall score: {:.3f}".format(recall_score(y_test, y_predictions, average='macro')))
best parameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300}
best mean cross-validation score: 0.632
test-set accuracy: 0.442
test-set f1-score: 0.417
test-set precision score: 0.496
test-set recall score: 0.460
/Users/kristenakey/opt/anaconda3/envs/maps/lib/python3.8/site-packages/tensorflow/python/keras/engine/sequential.py:450: UserWarning:
`model.predict_classes()` is deprecated and will be removed after 2021-01-01. Please use instead:* `np.argmax(model.predict(x), axis=-1)`, if your model does multi-class classification (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`, if your model does binary classification (e.g. if it uses a `sigmoid` last-layer activation).
XGBoost
from xgboost import XGBClassifier

# XGBoost grid: ensemble size, depth, shrinkage (eta).
param = {'n_estimators': [300, 400, 500],
         "max_depth": [3, 4, 5],
         "eta": [.001, .01, .1]}
# cv = number of folds
gridxgb = GridSearchCV(XGBClassifier(random_state=9),
                       param_grid=param, cv=5, scoring='f1_macro')
gridxgb.fit(preprocessor(X_train), y_train)

print(f"best parameters: {gridxgb.best_params_}")
print(f"best mean cross-validation score: {gridxgb.best_score_:.3f}")

y_predictions = gridxgb.predict(preprocessor(X_test))
print(f"test-set accuracy: {accuracy_score(y_test, y_predictions):.3f}")
print(f"test-set f1-score: {f1_score(y_test, y_predictions, average='macro'):.3f}")
print(f"test-set precision score: {precision_score(y_test, y_predictions, average='macro'):.3f}")
print(f"test-set recall score: {recall_score(y_test, y_predictions, average='macro'):.3f}")
best parameters: {'eta': 0.1, 'max_depth': 5, 'n_estimators': 300}
best mean cross-validation score: 0.562
test-set accuracy: 0.385
test-set f1-score: 0.391
test-set precision score: 0.429
test-set recall score: 0.380
SVM
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')

# SVM grid: kernel family, kernel coefficient, regularization strength.
param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'gamma': ['scale', 'auto'],
              "C": [1, 5, 8, 10, 15]}
# cv = number of folds
gridsvc = GridSearchCV(SVC(random_state=9),
                       param_grid=param_grid, cv=5, scoring='f1_macro')
gridsvc.fit(preprocessor(X_train), y_train)
GridSearchCV(cv=5, estimator=SVC(random_state=9),
param_grid={'C': [1, 5, 8, 10, 15], 'gamma': ['scale', 'auto'],
'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
scoring='f1_macro')
# NOTE(review): GridSearchCV already refit the best estimator on the full
# training split (refit=True by default), so this explicit refit is
# redundant; the predictions below use gridsvc directly. Kept as-is to
# preserve the original behavior.
model = SVC(random_state=9, **gridsvc.best_params_).fit(preprocessor(X_train), y_train)

print(f"best parameters: {gridsvc.best_params_}")
print(f"best mean cross-validation score: {gridsvc.best_score_:.3f}")

y_predictions = gridsvc.predict(preprocessor(X_test))
print(f"test-set accuracy: {accuracy_score(y_test, y_predictions):.3f}")
print(f"test-set f1-score: {f1_score(y_test, y_predictions, average='macro'):.3f}")
print(f"test-set precision score: {precision_score(y_test, y_predictions, average='macro'):.3f}")
print(f"test-set recall score: {recall_score(y_test, y_predictions, average='macro'):.3f}")
best parameters: {'C': 5, 'gamma': 'scale', 'kernel': 'linear'}
best mean cross-validation score: 0.636
test-set accuracy: 0.538
test-set f1-score: 0.532
test-set precision score: 0.594
test-set recall score: 0.550
I predict happiness level with 5 machine learning algorithms. The best performing model is the Support Vector Machine, with the best mean cross-validation f1-score of .63 and a test-set f1-score of .53. The best SVM parameters are C=5 with a linear kernel and gamma='scale'.
Below, I include code used to try (unsuccessfully) to improve the best performing SVM model. First, I try modeling with only the most important features, then I try adding additional outside data to the original UN Happiness Index dataset.
What if we only try including the most important features?
# Keep only the three features that ranked highest in the earlier
# importance/correlation analyses.
X_sm = X[["GDP per capita", "Social support", "Healthy life expectancy"]]
# Same split fraction and seed as before so scores stay comparable.
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_num, test_size=.33, random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_train.columns.tolist())
(104, 3) (104,) ['GDP per capita', 'Social support', 'Healthy life expectancy']
# adjust preprocessor
# All three remaining columns are numeric; 'region' was not selected.
numeric_features = X_train.columns.tolist()
# numeric_features.remove('region')
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# categorical_features = ['region']
# Categorical branch kept for symmetry with the earlier pipeline, but it is
# not wired into the ColumnTransformer for this numeric-only subset.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# final preprocessor object set up with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)])

# Fit the preprocessor on the reduced training split ...
preprocess = preprocessor.fit(X_train)

# ... and wrap the fitted transform in a function, as before.
def preprocessor(data):
    """Apply the fitted median-impute + standardize transform."""
    return preprocess.transform(data)

# Check shape for keras input:
preprocessor(X_train).shape  # pretty small dataset
(104, 3)
# Same SVM grid as before, now on the three-feature matrix.
param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'gamma': ['scale', 'auto'],
              "C": [1, 5, 8, 10, 15]}
# cv = number of folds
gridsvc = GridSearchCV(SVC(random_state=9),
                       param_grid=param_grid, cv=5, scoring='f1_macro')
gridsvc.fit(preprocessor(X_train), y_train)

print(f"best parameters: {gridsvc.best_params_}")
print(f"best mean cross-validation score: {gridsvc.best_score_:.3f}")

y_predictions = gridsvc.predict(preprocessor(X_test))
print(f"test-set accuracy: {accuracy_score(y_test, y_predictions):.3f}")
print(f"test-set f1-score: {f1_score(y_test, y_predictions, average='macro'):.3f}")
print(f"test-set precision score: {precision_score(y_test, y_predictions, average='macro'):.3f}")
print(f"test-set recall score: {recall_score(y_test, y_predictions, average='macro'):.3f}")
best parameters: {'C': 8, 'gamma': 'auto', 'kernel': 'sigmoid'}
best mean cross-validation score: 0.567
test-set accuracy: 0.385
test-set f1-score: 0.364
test-set precision score: 0.417
test-set recall score: 0.390
The SVM model does not perform nearly as well with only a subset of features, and the best parameters selected are different. While features like generosity and perceptions of corruption are not the most important features in the dataset, they are still useful for predicting happiness levels!
What if we add more data?
Additional data from OECD Better Life Index
# Load OECD Better Life Index indicators; keep only the totals (TOT) rows.
oecd = pd.read_csv("oecd_better_life_index.csv")
oecd = oecd[oecd.INEQUALITY == "TOT"][["Country", "Indicator", "Inequality", "Value"]]

# clean data: one row per country, one column per indicator
oecd = oecd.pivot_table(index=["Country"], columns="Indicator", values='Value')
oecd.reset_index(drop=False, inplace=True)
oecd.index.names = ['ind']

# join with initial dataset
data = pd.read_csv("worldhappiness2019.csv")
# Clean up final region data — keep the country name this time so the two
# datasets can be merged on it.
X = data.drop(['Happiness_level', 'name', 'sub-region'], axis=1)
X = X.rename(columns={"Country or region": "Country"})
y = data['Happiness_level']

# Left join keeps every happiness-index country, with NaNs where the OECD
# data has no coverage; drop the merge key afterwards.
d = pd.merge(X, oecd, on="Country", how='left', indicator=False)
d = d.drop(["Country"], axis=1)
d.head()
| GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | region | Air pollution | Dwellings without basic facilities | Educational attainment | ... | Personal earnings | Quality of support network | Rooms per person | Self-reported health | Stakeholder engagement for developing regulations | Student skills | Time devoted to leisure and personal care | Voter turnout | Water quality | Years in education | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.340 | 1.587 | 0.986 | 0.596 | 0.153 | 0.393 | Europe | 6.0 | 0.5 | 88.0 | ... | 42964.0 | 95.0 | 1.9 | 70.0 | 2.2 | 523.0 | 15.17 | 67.0 | 95.0 | 19.8 |
| 1 | 1.383 | 1.573 | 0.996 | 0.592 | 0.252 | 0.410 | Europe | 9.0 | 0.5 | 81.0 | ... | 51466.0 | 95.0 | 1.9 | 71.0 | 2.0 | 504.0 | 15.87 | 86.0 | 95.0 | 19.5 |
| 2 | 1.488 | 1.582 | 1.028 | 0.603 | 0.271 | 0.341 | Europe | 5.0 | 0.0 | 82.0 | ... | 51212.0 | 94.0 | 2.1 | 77.0 | 2.2 | 504.0 | 15.56 | 78.0 | 98.0 | 18.3 |
| 3 | 1.380 | 1.624 | 1.026 | 0.591 | 0.354 | 0.118 | Europe | 3.0 | 0.0 | 77.0 | ... | 61787.0 | 98.0 | 1.6 | 76.0 | 2.1 | 481.0 | NaN | 79.0 | 99.0 | 19.0 |
| 4 | 1.396 | 1.522 | 0.999 | 0.557 | 0.322 | 0.298 | Europe | 14.0 | 0.1 | 78.0 | ... | 52877.0 | 91.0 | 1.9 | 76.0 | 2.6 | 508.0 | NaN | 82.0 | 93.0 | 18.7 |
5 rows × 31 columns
# Set up training and test data
# Same seed and split fraction as the earlier experiments so the enriched
# model's scores are directly comparable.
X_train, X_test, y_train, y_test = train_test_split(d, y_num, test_size=.33, random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_train.columns.tolist())
(104, 31) (104,) ['GDP per capita', 'Social support', 'Healthy life expectancy', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption', 'region', 'Air pollution', 'Dwellings without basic facilities', 'Educational attainment', 'Employees working very long hours', 'Employment rate', 'Feeling safe walking alone at night', 'Homicide rate', 'Household net adjusted disposable income', 'Household net wealth', 'Housing expenditure', 'Labour market insecurity', 'Life expectancy', 'Life satisfaction', 'Long-term unemployment rate', 'Personal earnings', 'Quality of support network', 'Rooms per person', 'Self-reported health', 'Stakeholder engagement for developing regulations', 'Student skills', 'Time devoted to leisure and personal care', 'Voter turnout', 'Water quality', 'Years in education']
# number of rows of new additional dataset / number of rows of original dataset
# (i.e. the fraction of happiness-index countries covered by the OECD data)
oecd.shape[0] / data.shape[0]
0.26282051282051283
While this additional dataset is comprehensive, one major problem is that it only has observations for about one-quarter of the dataset's countries! That means there are many missing observations. Since I still want to try adding this dataset to the original because of the many additional fields it provides, I try to overcome the missing-data problem by grouping by region (continent) and filling each gap with the average of the values that are present for that feature.
# Fill each country's missing OECD values with the mean of the countries in
# the same region (continent) within the same split.
# NOTE(review): groupby(...).transform drops the 'region' column itself, so
# region is no longer available as a feature downstream (the later shape
# check shows 30 columns, not 31).
# NOTE(review): test-split fills use test-set region means, computed
# independently of the training split — confirm this is intended.
X_train = X_train.groupby("region").transform(lambda x: x.fillna(x.mean()))
X_test = X_test.groupby("region").transform(lambda x: x.fillna(x.mean()))
# Adjust the preprocessor: after the group-mean imputation above, every
# remaining column is numeric, so a single numeric pipeline covers the frame.
numeric_features = list(X_train.columns)
# numeric_features.remove('region')

# Mean-impute anything still missing, then standardize to zero mean / unit variance.
numeric_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='mean')),
           ('scaler', StandardScaler())]
)

# categorical_features = ['region']
# Categorical pipeline (currently unused — "region" was dropped by the
# group-mean transform above): modal imputation, then one-hot encoding.
categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='most_frequent')),
           ('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Final preprocessor object assembled with ColumnTransformer; only the
# numeric branch is active.
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)#,
                  # ('cat', categorical_transformer, categorical_features)
                  ])
#Fit your preprocessor object
# NOTE(review): the `def preprocessor(...)` below rebinds the name from the
# ColumnTransformer defined above to a function; the fitted transformer
# survives only through `preprocess` (`.fit` returns the fitted estimator
# itself). Confusing shadowing, but kept — later cells call `preprocessor(...)`.
preprocess=preprocessor.fit(X_train)
# Write function to transform data with preprocessor
def preprocessor(data):
    """Apply the fitted ColumnTransformer to `data`; returns the transformed array."""
    preprocessed_data=preprocess.transform(data)
    return preprocessed_data
# Check the transformed matrix shape (rows, features), e.g. for keras input:
preprocessor(X_train).shape # pretty small dataset
(104, 30)
# Inspect the standardized feature matrix itself
preprocessor(X_train)
array([[ 1.16647491, 0.48633461, 0.22247203, ..., -1.12215647,
0.68896486, -1.04619344],
[-0.05243526, -1.73341456, 0.0426934 , ..., -1.12215647,
0.68896486, -1.04619344],
[-1.24541117, -1.20979789, -1.53350523, ..., 0.73996888,
-1.39035423, 0. ],
...,
[ 1.00143871, 1.06777398, 1.06283257, ..., 0.36754381,
0.377067 , 0.21056205],
[ 0.2847101 , -0.25893573, 0.63220006, ..., -1.12215647,
0.68896486, -1.04619344],
[-0.55461682, -1.30616906, -0.97744577, ..., 0.73996888,
-1.39035423, 0. ]])
# NOTE(review): SVC is not among the imports visible at the top of the file —
# import it here so this cell runs standalone; harmless if already imported.
from sklearn.svm import SVC

# Hyperparameter grid for the SVM: kernel family, gamma heuristic, and
# regularization strength C.
param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'gamma': ['scale', 'auto'],
              "C": [1, 5, 8, 10, 15]}
# cv = number of folds; macro-F1 weights all five happiness classes equally.
gridsvc = GridSearchCV(SVC(random_state=9),
                       param_grid=param_grid, cv=5, scoring='f1_macro')
gridsvc.fit(preprocessor(X_train), y_train)
GridSearchCV(cv=5, estimator=SVC(random_state=9),
param_grid={'C': [1, 5, 8, 10, 15], 'gamma': ['scale', 'auto'],
'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
scoring='f1_macro')
# Evaluate the tuned SVM on the held-out test set; multi-class scores use
# macro averaging to weight all five happiness classes equally.
y_predictions = gridsvc.predict(preprocessor(X_test))
print(f"test-set accuracy: {accuracy_score(y_test, y_predictions):.3f}")
print(f"test-set f1-score: {f1_score(y_test, y_predictions, average='macro'):.3f}")
print(f"test-set precision score: {precision_score(y_test, y_predictions, average='macro'):.3f}")
print(f"test-set recall score: {recall_score(y_test, y_predictions, average='macro'):.3f}")
test-set accuracy: 0.462 test-set f1-score: 0.429 test-set precision score: 0.516 test-set recall score: 0.467
Again, the SVM model does not perform nearly as well as the best performing model. This model performs slightly better than the one with only the three most important features, but it still fails to reach the same scores as the best model trained on only the original data. More data (especially data with many missing values) is not necessarily helpful for improving a model's performance.
The plot below shows feature importance with the additional data. Compared to the original dataset, all of the additional features are much less important on the Gini-importance scale.
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the preprocessed training data purely to read off
# Gini feature importances for the enlarged feature set.
# NOTE(review): no random_state is set, so importances vary slightly per run.
model = RandomForestClassifier()
model.fit(preprocessor(X_train), y_train)

# Map feature name -> Gini importance. (The original code assigned
# model.feature_importances_ to a variable that was immediately clobbered by
# the loop variable, and built the dict with a manual loop — both replaced by
# a single dict(zip(...)).) Relies on the ColumnTransformer preserving column
# order; only the numeric branch is active, so order matches X_train.columns.
feats = dict(zip(X_train.columns, model.feature_importances_))

# Bar chart of importances, one bar per feature.
plt.bar(*zip(*feats.items()))
plt.xticks(rotation=90)
plt.title("Gini-importance")
plt.show()